Here, we will transfer the label and UMAP coordinates from the discovery cohort to the validation cohort. We will do it one compartment at a time (here, follicular dendritic cells, pdc), so we can validate that the transfer is robust.
Specifically, we will follow these steps for every compartment:
library(Seurat)
library(tidyverse)
library(caret)
library(class)
library(harmony)
library(here)
library(glue)
library(DT)
source(here("scRNA-seq/bin/SLOcatoR_functions.R"))
source(here("scRNA-seq/bin/utils.R"))
source(here("scRNA-seq/bin/utils_final_clusters.R"))
color_palette <- c("#1CFFCE", "#90AD1C", "#C075A6", "#85660D", "#5A5156", "#AA0DFE",
"#F8A19F", "#F7E1A0", "#1C8356", "#FEAF16", "#822E1C", "#C4451C",
"#1CBE4F", "#325A9B", "#F6222E", "#FE00FA", "#FBE426", "#16FF32",
"black", "#3283FE", "#B00068", "#DEA0FD", "#B10DA1", "#E4E1E3",
"#90AD1C", "#FE00FA", "#85660D", "#3B00FB", "#822E1C", "coral2",
"#1CFFCE", "#1CBE4F", "#3283FE", "#FBE426", "#F7E1A0", "#325A9B",
"#2ED9FF", "#B5EFB5", "#5A5156", "#DEA0FD", "#FEAF16", "#683B79",
"#B10DA1", "#1C7F93", "#F8A19F", "dark orange", "#FEAF16", "#FBE426",
"Brown")
# Read data, clean and include annotation
pdc <- readRDS(here("scRNA-seq/results/R_objects/seurat_objects_revision/4.3-PDC_seurat_object_discovery_validation_cohorts.rds"))
pdc <- updateAnnotation(pdc)
table(pdc$assay)
##
## 3P multiome
## 1114 275
pdc <- pdc[, pdc$assay == "3P"]
pdc$annotation_20230508 <- pdc$annotation_20220619
pdc$annotation_20230508[is.na(pdc$annotation_20230508)] <- "unannotated"
pdc$annotation_20230508 <- factor(
pdc$annotation_20230508,
levels = c(names(colors_20230508[["PDC"]]), "unannotated")
)
Idents(pdc) <- "annotation_20230508"
pdc$is_reference <- ifelse(
pdc$cohort_type == "discovery",
"reference",
"query"
)
DimPlot(
pdc,
group.by = "annotation_20230508",
split.by = "is_reference",
cols = colors_20230508[["PDC"]]
)
DimPlot(
pdc,
group.by = "assay",
split.by = "is_reference"
)
We see that we have too few multiome cells for a proper label transfer. Thus, we will label them as “pdc”, and focus on transering the label from the discovery scRNA-seq dataset to validation scRNA-seq. Let us integrate again this dataset:
Idents(pdc) <- "gem_id"
shared_hvg <- find_assay_specific_features(pdc, assay_var = "cohort_type")
print(glue("The number of shared HVG is: {length(shared_hvg)}"))
## The number of shared HVG is: 2189
table(pdc$cohort_type)
##
## discovery validation
## 322 792
pdc <- integrate_assays(
pdc,
assay_var = "cohort_type",
shared_hvg = shared_hvg
)
pdc <- RunUMAP(pdc, dims = 1:25, reduction = "harmony")
DimPlot(
pdc,
group.by = "annotation_20230508",
split.by = "cohort_type",
cols = colors_20230508[["PDC"]]
)
DimPlot(
pdc,
group.by = "cohort_type"
)
FeaturePlot(pdc, c("CD79B", "CD3G")) &
scale_color_viridis_c(option = "magma")
pdc <- FindNeighbors(pdc, reduction = "harmony", dims = 1:25, k.param = 10)
pdc <- FindClusters(pdc, resolution = 0.8)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 1114
## Number of edges: 20405
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7701
## Number of communities: 11
## Elapsed time: 0 seconds
DimPlot(pdc, cols = color_palette)
We see that there are two clusters of doublets with B and T cells (5, 10). Let’s exclude them and reintegrate:
markers_5 <- FindMarkers(pdc, ident.1 = "5", only.pos = TRUE, logfc.threshold = 0.7)
DT::datatable(markers_5)
markers_10 <- FindMarkers(pdc, ident.1 = "10", only.pos = TRUE, logfc.threshold = 0.7)
DT::datatable(markers_10)
pdc <- pdc[, !(pdc$seurat_clusters %in% c("5", "10") & pdc$cohort_type == "validation")]
DimPlot(pdc)
table(pdc$cohort_type)
##
## discovery validation
## 322 697
shared_hvg <- find_assay_specific_features(pdc, assay_var = "cohort_type")
print(glue("The number of shared HVG is: {length(shared_hvg)}"))
## The number of shared HVG is: 2227
table(pdc$cohort_type)
##
## discovery validation
## 322 697
pdc <- integrate_assays(
pdc,
assay_var = "cohort_type",
shared_hvg = shared_hvg
)
pdc <- RunUMAP(pdc, dims = 1:25, reduction = "harmony")
DimPlot(
pdc,
group.by = "annotation_20230508",
split.by = "cohort_type",
cols = colors_20230508[["PDC"]]
)
Now we can transfer the label and UMAP coords
# Split training and test sets
data_sets <- split_training_and_test_sets(
pdc,
split_var = "cohort_type",
referece_label = "discovery",
query_label = "validation",
reduction = "harmony",
n_dims = 25
)
# Transfer label
annotation_query_df <- transfer_label(
seurat_obj = pdc,
training_set = data_sets$training_set,
test_set = data_sets$test_set,
k = 8,
response_var = "annotation_20230508"
)
# Transfer UMAP coords
# umap_test_df <- transfer_umap_coords(
# seurat_obj = pdc,
# training_set = data_sets$training_set,
# test_set = data_sets$test_set,
# umap1_var = "UMAP_1_20220215",
# umap2_var = "UMAP_2_20220215",
# k = 15
# )
# Include annotation and UMAP coords in Seurat object
pdc$annotation_20230508[annotation_query_df$query_cells] <- annotation_query_df$annotation
Idents(pdc) <- "annotation_20230508"
pdc$annotation_20230508_probability <- NA
pdc$annotation_20230508_probability[annotation_query_df$query_cells] <- annotation_query_df$annotation_prob
# pdc$UMAP_1_20220215[umap_test_df$query_cells] <- umap_test_df$UMAP1
# pdc$UMAP_2_20220215[umap_test_df$query_cells] <- umap_test_df$UMAP2
p1 <- FeaturePlot(
pdc[, annotation_query_df$query_cells],
"annotation_20230508_probability"
) + scale_color_viridis_c(option = "magma")
p2 <- FeaturePlot(
pdc[, annotation_query_df$query_cells],
"doublet_score_scDblFinder"
) + scale_color_viridis_c(option = "magma")
p1 | p2
DimPlot(
pdc,
group.by = "annotation_20230508",
split.by = "cohort_type",
cols = colors_20230508[["PDC"]]
)
table(pdc$annotation_20230508_probability)
##
## 0.5 0.625 0.75 0.875 1
## 9 23 48 148 469
FeaturePlot(pdc, c("nFeature_RNA", "pct_mt"))
We see that there is a cluster with hallmarks of poor-quality cells: high % of mitochondrial expression and low number of detected features. We will exclude it:
pdc <- FindNeighbors(pdc, k.param = 15, dims = 1:20, reduction = "harmony")
pdc <- FindClusters(pdc, resolution = 0.2)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 1019
## Number of edges: 40431
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8597
## Number of communities: 4
## Elapsed time: 0 seconds
DimPlot(pdc)
markers_2 <- FindMarkers(pdc, ident.1 = "2", only.pos = TRUE)
DT::datatable(markers_2)
pdc <- pdc[, !(pdc$cohort_type == "validation" & pdc$seurat_clusters == "2")]
DimPlot(pdc, cols = color_palette)
goi <- c("IRF8", "IL3RA", "LILRA4", "IFI44", "IFI6")
FeaturePlot(pdc, goi) &
scale_color_viridis_c(option = "magma")
DimPlot(pdc, group.by = "annotation_20230508", cols = colors_20230508$PDC)
saveRDS(pdc, here("scRNA-seq/results/R_objects/seurat_objects_revision/5.3-PDC_seurat_object_discovery_validation_cohorts_annotation.rds"))
sessionInfo()
## R version 4.2.0 (2022-04-22)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=es_ES.UTF-8 LC_COLLATE=en_US.UTF-8 LC_MONETARY=es_ES.UTF-8 LC_MESSAGES=en_US.UTF-8 LC_PAPER=es_ES.UTF-8 LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=es_ES.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] DT_0.27 glue_1.6.2 here_1.0.1 harmony_0.1.1 Rcpp_1.0.10 class_7.3-21 caret_6.0-93 lattice_0.20-45 forcats_1.0.0 stringr_1.5.0 dplyr_1.1.0 purrr_1.0.1 readr_2.1.3 tidyr_1.3.0 tibble_3.1.8 ggplot2_3.4.0 tidyverse_1.3.2 SeuratObject_4.1.3 Seurat_4.3.0 BiocStyle_2.26.0
##
## loaded via a namespace (and not attached):
## [1] readxl_1.4.1 backports_1.4.1 plyr_1.8.8 igraph_1.3.5 lazyeval_0.2.2 sp_1.6-0 splines_4.2.0 crosstalk_1.2.0 listenv_0.9.0 scattermore_0.8 digest_0.6.31 foreach_1.5.2 htmltools_0.5.4 fansi_1.0.4 magrittr_2.0.3 tensor_1.5 googlesheets4_1.0.1 cluster_2.1.4 ROCR_1.0-11 limma_3.54.1 tzdb_0.3.0 recipes_1.0.4 globals_0.16.2 gower_1.0.1 modelr_0.1.10 matrixStats_0.63.0 hardhat_1.2.0 timechange_0.2.0 spatstat.sparse_3.0-0 colorspace_2.1-0 rvest_1.0.3 ggrepel_0.9.3 haven_2.5.1 xfun_0.37 crayon_1.5.2 jsonlite_1.8.4 progressr_0.13.0 spatstat.data_3.0-0 iterators_1.0.14 survival_3.5-0 zoo_1.8-11 polyclip_1.10-4 gtable_0.3.1 gargle_1.3.0 ipred_0.9-13 leiden_0.4.3 future.apply_1.10.0 abind_1.4-5 scales_1.2.1 DBI_1.1.3 spatstat.random_3.1-3
## [52] miniUI_0.1.1.1 viridisLite_0.4.1 xtable_1.8-4 reticulate_1.28 stats4_4.2.0 lava_1.7.1 prodlim_2019.11.13 htmlwidgets_1.6.1 httr_1.4.4 RColorBrewer_1.1-3 ellipsis_0.3.2 ica_1.0-3 farver_2.1.1 pkgconfig_2.0.3 nnet_7.3-18 sass_0.4.5 uwot_0.1.14 dbplyr_2.3.2 deldir_1.0-6 utf8_1.2.3 labeling_0.4.2 tidyselect_1.2.0 rlang_1.0.6 reshape2_1.4.4 later_1.3.0 munsell_0.5.0 cellranger_1.1.0 tools_4.2.0 cachem_1.0.6 cli_3.6.0 generics_0.1.3 broom_1.0.3 ggridges_0.5.4 evaluate_0.20 fastmap_1.1.0 yaml_2.3.7 goftest_1.2-3 ModelMetrics_1.2.2.2 knitr_1.42 fs_1.6.0 fitdistrplus_1.1-8 RANN_2.6.1 pbapply_1.7-0 future_1.31.0 nlme_3.1-162 mime_0.12 xml2_1.3.3 compiler_4.2.0 rstudioapi_0.14 plotly_4.10.1 png_0.1-8
## [103] spatstat.utils_3.0-1 reprex_2.0.2 bslib_0.4.2 stringi_1.7.12 highr_0.10 Matrix_1.5-3 vctrs_0.5.2 pillar_1.8.1 lifecycle_1.0.3 BiocManager_1.30.19 spatstat.geom_3.0-6 lmtest_0.9-40 jquerylib_0.1.4 RcppAnnoy_0.0.20 data.table_1.14.6 cowplot_1.1.1 irlba_2.3.5.1 httpuv_1.6.8 patchwork_1.1.2 R6_2.5.1 bookdown_0.32 promises_1.2.0.1 KernSmooth_2.23-20 gridExtra_2.3 parallelly_1.34.0 codetools_0.2-19 MASS_7.3-58.2 rprojroot_2.0.3 withr_2.5.0 sctransform_0.3.5 parallel_4.2.0 hms_1.1.2 rpart_4.1.19 timeDate_4022.108 grid_4.2.0 rmarkdown_2.20 googledrive_2.0.0 Rtsne_0.16 pROC_1.18.0 spatstat.explore_3.0-6 shiny_1.7.4 lubridate_1.9.1